Archive/shuffled_ttest v1.11.r

shuffled_ttest.Version = 1.11;
#initial program for running the shuffling of t-tests
#Data should be organized with Var 1 in column 1 and Var 2 in column 2

#runs independant sample t-test or paired t test, a number of times, with the data being segmented by minimum number of participants needed to reach significance

#####Parameters--------------

#data_set should refer to a dataset available in the global envir
#shuffle_amount - denotes how many times the data will be shuffled
#alpha - desired alpha value
#paired - optional TRUE/FALSE for Paired Sample t-Test, default is False
#csvFileName, if included allows the results to be added to a separate data.frame - Must be in " "


#Required packages
packages = c("tictoc");

#use this function to check if each package is on the local machine
#if a package is installed, it will be loaded
#if any are not, the missing package(s) will be installed and loaded
package.check <- lapply(packages, FUN = function(x) {
    if (!require(x, character.only = TRUE)) {
        install.packages(x, dependencies = TRUE)
        library(x, character.only = TRUE);
    }
})
######End Packages

######Find Base n--------------------------------------------------

find_base_n <- function(data_set, alpha, method) {
#save variable
alpha <- alpha;
#initialize pvalues variable as vector
pvalues <- vector(mode="double", length=2); 
pvalues[1:2] = 1;

#Convert piped in dataset to a data frame
data_set <- as.data.frame(
	data_set, 
	row.names = NULL, 
	optional = FALSE,
	cut.names = FALSE, 
	col.names = names(data_set), 
	fix.empty.names = TRUE,
	stringsAsFactors = FALSE);

	
#######Incremental t-Tests------------------------
#i.e. 1-2, 1-3, 1-4, 1-5, etc.. 
	for(k in 2:nrow(data_set)) { 
		
	#Paired Sample = FALSE - Run independant Sample
		if (method == FALSE) {
		
			#Makes sure there is variance prior to running ind-sample t-test			
			#Use apply to get variance values rows 1:k, in columns 1 and 2
			#Use all to compare variance to 0
			if (all(apply(data_set[1:k,], 2, var) != 0 )){
				
				#Saves iterative p values in a vector 
				pvalues[k] <- t.test(data_set[1:k,1],data_set[1:k,2])$p.value;
			}
			else {
				pvalues[k] <- 2;
			}
		} #end if
		
		
	#Paired Sample = TRUE - Run paired Sample
		if (method == TRUE) {
		
			#Makes sure there is variance before running paired sample t-test
			#Use apply to get variance values in both rows and colums
			#Use all to compare variances to 0
			if (sum(apply(data_set[1:k,], 1, var)) > 0 & 
				sum(apply(data_set[1:k,], 2, var)) > 0 ){
				
				#tryCatch---
				tryCatch({ #checks for errors
				#Saves iterative p values in a vector 
				pvalues[k] <- t.test(data_set[1:k,1],data_set[1:k,2], paired = TRUE)$p.value;
				},
				
				#If there is an error, set current p value = 2
				error = function(err) { 
					pvalues[k] <- 2;
				}); #End tryCatch---
			}
			
			#If there is not variance, set p value = 2
			else {
				pvalues[k] <- 2;
			}
		} #end if
} #End For Loop	

	i=3; #starts with atleast 2 participants
	
######Minimum Sig. Value------------------
	#Loop through pvalues vector from above to find first significant p value
	while(i <= length(pvalues)) {
		#For debugging, saves list of pvalues to Global env
		#assign('pvalues',pvalues, envir=.GlobalEnv);
		
		###Minimum base_n allowed----------
		min_n <- 10;

		#Return number of participants needed for significance with a minimum number of participants examined
		if (pvalues[i] <= alpha & i >= min_n) {
			return(i);
			}
		#Continues loop if not in the current row
		else {
			i <- i + 1;
		}	
	} #End While loop
	#if no significant p values are found, return 0.
	return(0);
}


######Shuffled t-Test Function---------------------------
shuffled_ttest <- function(data_set, shuffle_amount, alpha, paired=FALSE, csvFileName){

######Variable setup-----------------------------------

#convert data to data.frame
data_set <- as.data.frame(data_set, row.names = NULL, optional = FALSE,
              cut.names = FALSE, col.names = names(data_set), fix.empty.names = TRUE,
              stringsAsFactors = default.stringsAsFactors());

shuffle_amount <- shuffle_amount;

tic("Run time") #start timer----

###Alpha Check----
#if alpha parameter is included, save the variable
if (!missing(alpha)) {
	#Set alpha value from input
	alpha <- alpha;
	}
	
	else {
		alpha <- .05;
	}

###Paired t-Test Check----
#Option for Paired-Sample t-test - Default is FALSE
if (paired == FALSE) {
	paired_test <- FALSE;
	t.method <- "Independant Sample t-Test";
	}
	else {
	paired_test <- TRUE;
	t.method <- "Paired Sample t-Test";}
	
###CSV Output Check----
#if csvFileName parameter is included, save the variable
if (!missing(csvFileName)) {
	#Appends '.csv' and saves desired file name as variable csvFileName
	csvFileName <- paste(csvFileName,".csv",sep="");
	}
	

#Create statistical output data frame named "results", with 7 headers, for ind. sampled t-tests, clears old data with each new run
results <- data.frame("iteration" = numeric(0), "sample" = numeric(0),"range" = character(0), "base n" = numeric(0), "t" = double(0),"df" = double(0),"p value" = double(0), stringsAsFactors = FALSE);

group1_col <- 1; #group 1 column = variable 1, change as needed
group2_col <- 2; #group 2 column = variable 2, change as needed

sum_sig_p <- 0; #used to keep track of number of significant findings
sum_NA <- 0; #counter for no variance comparisons
	
########End Variable Setup---------------------------------	
	

#Warning for large shuffling amounts
if (shuffle_amount > 50) {
	print("Please wait...");
	}


######Shuffling and replication ------------------------------
	#Shuffles the data a number of times = to shuffle amount, running the replication tests for each iteration
	for (i in 1:shuffle_amount) {
				
		cycle <- 1; #keep track of replications within shuffles
		
		x<-1; #Used for lower bounds of current selection - resets on new iteration
		
		#shuffles data set using 'sample()' 
		data_set <- data_set[sample(1:nrow(data_set)),];
		
		#assigns shuffled data to global env
		assign("shuffled_data",data_set, envir=.GlobalEnv);
		
		#finds base n for each iteration
		base_n <- find_base_n(data_set, alpha, paired_test); 
		
		#If there are no significant findings, defaults to all data
		if (base_n == 0) {
			base_n <- nrow(data_set);
			}
		#y=set to min number of participants needed for each shuffle
		y <- base_n; 


	#Repeats while the current selection of participants is less than the max number of participants - does not run less than base_n number of participants, so there may be missing data at the end
		#TODO - Add option for include/exclude uneven N
	while (y <= nrow(data_set)) {
	
	#Ind sample t-test---------------------
		if (paired_test == FALSE) {
			
			#Check that variance is greater than 0 in current selections			
			#Use apply to get variance values of rows x:y, in columns 1 and 2
			#Use all to compare variance to 0		
			if (all(apply(data_set[x:y,], 2, var) != 0 )){
								
				#t test on Group 1 and Group 2 using current selection of participants x through y
				ttestresults <- t.test(data_set[x:y,group1_col],data_set[x:y,group2_col]);
				
				#if the test is signficant, increase count by 1
				if (ttestresults$p.value <= alpha) {
					sum_sig_p <- sum_sig_p + 1;
					}
			
				#add statistical output to new row in results data.frame, rounding down the decimals
				#Organized as [iteration, cycle number, range, t-test statistic, degrees of freedom, p value].
				results[nrow(results) + 1,] <- list(i,cycle,paste(x,':',y, sep=""),base_n,round(ttestresults$statistic,3), round(ttestresults$parameter,4), round(ttestresults$p.value,5));
			}
					
		#If there is 0 variance, report NA for test statistics
		else {
			results[nrow(results) + 1,] <- list(i,cycle,paste(x,':',y, sep=""),base_n,"NA", "NA", "NA");
			sum_NA <- sum_NA + 1;
			}
		} #end if

			
	#Paired sample t-test--------------------
		if (paired_test == TRUE) {
			#Check that variance is greater than 0 in current selections			
			#Use apply to get variance values within-subject and between subject - 1=rows, 2=cols
			#Use all to compare variance to 0	
			if (#all(apply(data_set[x:y,], 1, var) != 0) &
				#all(apply(data_set[x:y,1], 2, var) != 0) &
				all(apply(data_set[x:y,], 2, var) != 0)){
				
				#debug variable values when errors occur
				assign("x",x, envir=.GlobalEnv);
				assign("y",y, envir=.GlobalEnv);
				assign("base n",base_n, envir=.GlobalEnv);
				
				#Catch data is constant errors
				tryCatch({ 
				#t test on Group 1 and Group 2 using current selection of participants x through y
				ttestresults <- t.test(data_set[x:y,group1_col],data_set[x:y,group2_col],paired=TRUE);
				
				},
				error = function(err) {
					sum_NA <- sum_NA + 1;
					ttestresults$p.value <- 9;
				})
			
				
				#if the test is signficant, increase count by 1
				if (is.null(ttestresults$p.value) == FALSE &
					ttestresults$p.value <= alpha) {
					sum_sig_p <- sum_sig_p + 1;
				}
			
				#add statistical output to new row in results data.frame, rounding down the decimals
				#Organized as [iteration, cycle number, range, t-test statistic, degrees of freedom, p value].
				results[nrow(results) + 1,] <- list(i,cycle,paste(x,':',y, sep=""),base_n,round(ttestresults$statistic,3), round(ttestresults$parameter,4), round(ttestresults$p.value,5));
			}	
				
			else {
				#Skips t-test and report NA findings when variance = 0
				results[nrow(results) + 1,] <- list(i,cycle,paste(x,':',y, sep=""),base_n,"NA", "NA", "NA");
							sum_NA <- sum_NA + 1;
			}	
		} #end if
			
	#Selects new range of participants of length base_n and increase cycle count
	x<-x+base_n; 
	y<-y+base_n;
	cycle <- cycle + 1;
	}#end while loop
	
	end_time=Sys.time(); #End Timer
}#End for loop
	
#######End shuffling and replication---------------------	

	
	
	
#######Export--------------------------------------------
	#Saves results to custom external file if option is include in parameters, if none included in argument, defaults output to 'results.csv'
	if (missing(csvFileName)){
		assign('results',results, envir=.GlobalEnv);
		write.csv(results, file="results.csv", row.names=TRUE);
		} 
		
		#if there IS a name included
		else { 
		
		#writes to a csv file using the variable output_fname
		write.csv(results, file=csvFileName, row.names=TRUE);
		}
#######End Export ------------------------------------------
	
	
	
	
#######Output-----------------------------------------------
	#shows the results in console if there are less than 50 rows
	if (nrow(results) < 50) {
		show(results);
	}
	toc(); #total time taken
	
	#Summary of results - cat() allows use of \n for line breaks
	cat(t.method,
	"\nSignificant findings (p < ",alpha,"): ", sum_sig_p, "/", nrow(results), " (",round((sum_sig_p/nrow(results))*100,2),"%)",
	"\nZero Variances: ",sum_NA, " | Mean Base n: ", mean(results[,4]),sep="");
}
baileymh/Shuffle documentation built on Sept. 4, 2019, 8:43 a.m.